{
 "cells": [
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-21T16:32:46.474583Z",
     "start_time": "2024-08-21T16:32:39.480718Z"
    }
   },
   "source": "%use krangl@2fcf74dfbbe382f1803d1ab9e4739439e1f5671b",
   "outputs": [],
   "execution_count": 1
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// in memory\n",
    "val df: DataFrame = dataFrameOf(\n",
    "    \"first_name\", \"last_name\", \"age\", \"weight\")(\n",
    "    \"Max\", \"Doe\", 23, 55,\n",
    "    \"Franz\", \"Smith\", 23, 88,\n",
    "    \"Horst\", \"Keanes\", 12, 82\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">salary_category</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"3\">3</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"3\">3</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"3\">3</td></tr></table></body></html>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.addColumn(\"salary_category\") { 3 }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">age_3y_later</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"26\">26</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"26\">26</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"15\">15</td></tr></table></body></html>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.addColumn(\"age_3y_later\") { it[\"age\"] + 3 }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">full_name</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"Max Doe\">Max Doe</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"Franz Smith\">Franz Smith</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"Horst Keanes\">Horst Keanes</td></tr></table></body></html>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.\n",
    "val newDF = df.addColumn(\"full_name\") { it[\"first_name\"] + \" \" + it[\"last_name\"] }\n",
    "newDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">user_id</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"Doe_id1\">Doe_id1</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"Smith_id2\">Smith_id2</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"Keanes_id3\">Keanes_id3</td></tr></table></body></html>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Also feel free to mix types here since krangl overloads  arithmetic operators like + for dataframe-columns\n",
    "df.addColumn(\"user_id\") { it[\"last_name\"] + \"_id\" + rowNumber }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">with_anz</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"false\">false</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"true\">true</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"false\">false</td></tr></table></body></html>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Create new attributes with string operations like matching, splitting or extraction.\n",
    "df.addColumn(\"with_anz\") { it[\"first_name\"].asStrings().map { it!!.contains(\"anz\") } }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">first_name_initial</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"M\">M</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"F\">F</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"H\">H</td></tr></table></body></html>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits\n",
    "df.addColumn(\"first_name_initial\") { it[\"first_name\"].map<String>{ it.first() } }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th><th style=\"text-align:left\">age_plus3</th><th style=\"text-align:left\">initials</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td><td style=\"text-align:left\" title=\"26\">26</td><td style=\"text-align:left\" title=\"M D\">M D</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td><td style=\"text-align:left\" title=\"26\">26</td><td style=\"text-align:left\" title=\"F S\">F S</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td><td style=\"text-align:left\" title=\"15\">15</td><td style=\"text-align:left\" title=\"H K\">H K</td></tr></table></body></html>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
    "\n",
    "// or add multiple columns at once\n",
    "df.addColumns(\n",
    "    \"age_plus3\" to { it[\"age\"] + 3 },\n",
    "    \"initials\" to { it[\"first_name\"].map<String> { it.first() } concat it[\"last_name\"].map<String> { it.first() } }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr></table></body></html>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Sort your data with sortedBy\n",
    "df.sortedBy(\"age\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr></table></body></html>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// and add secondary sorting attributes as varargs\n",
    "df.sortedBy(\"age\", \"weight\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sortedByDescending(\"age\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr></table></body></html>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sortedBy { it[\"weight\"].asInts() }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr><tr><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Subset columns with select\n",
    "df.selectIf { it is IntCol } // functional style column selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"88\">88</td></tr><tr><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.select(\"last_name\", \"weight\")    // positive selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td></tr></table></body></html>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.remove(\"weight\", \"age\")  // negative selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td></tr></table></body></html>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.select({ endsWith(\"name\") })    // selector mini-language"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr></table></body></html>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Subset rows with vectorized filter\n",
    "df.filter { it[\"age\"] eq 23 }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.filter { it[\"weight\"] gt 60 }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr></table></body></html>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.filter({ it[\"last_name\"].isMatching<String> { startsWith(\"Do\")  }})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr></table></body></html>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// In case vectorized operations are not possible or available we can also filter tables by row\n",
    "// which allows for scalar operators\n",
    "df.filterByRow { it[\"age\"] as Int > 20 }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.filterByRow { (it[\"age\"] as Int).rem(10) == 2 }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Summarize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">age</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">n</th></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"1\">1</td></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"1\">1</td></tr><tr><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"1\">1</td></tr></table></body></html>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// do simple cross tabulations\n",
    "df.count(\"age\", \"last_name\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">mean_age</th></tr><tr><td style=\"text-align:left\" title=\"19.333333333333332\">19.333333333333332</td></tr></table></body></html>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
    "\n",
    "// ... or calculate single summary statistic\n",
    "df.summarize(\"mean_age\" to { it[\"age\"].mean(true) })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">min_age</th><th style=\"text-align:left\">max_age</th></tr><tr><td style=\"text-align:left\" title=\"12.0\">12.0</td><td style=\"text-align:left\" title=\"23.0\">23.0</td></tr></table></body></html>"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
    "\n",
    "// ... or multiple summary statistics\n",
    "df.summarize(\n",
    "    \"min_age\" to { it[\"age\"].min() },\n",
    "    \"max_age\" to { it[\"age\"].max() }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">min_age</th><th style=\"text-align:left\">max_age</th></tr><tr><td style=\"text-align:left\" title=\"12.0\">12.0</td><td style=\"text-align:left\" title=\"23.0\">23.0</td></tr></table></body></html>"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// for sake of r and python adoptability you can also use `=` here\n",
    "df.summarize(\n",
    "    \"min_age\" `=` { it[\"age\"].min() },\n",
    "    \"max_age\" `=` { it[\"age\"].max() }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">age</th><th style=\"text-align:left\">mean_weight</th><th style=\"text-align:left\">num_persons</th></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"71.5\">71.5</td><td style=\"text-align:left\" title=\"2\">2</td></tr><tr><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82.0\">82.0</td><td style=\"text-align:left\" title=\"1\">1</td></tr></table></body></html>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import krangl.* // Workaround for https://github.com/Kotlin/kotlin-jupyter/issues/25\n",
    "\n",
    "// Grouped operations\n",
    "val groupedDf: DataFrame = df.groupBy(\"age\") // or provide multiple grouping attributes with varargs\n",
    "\n",
    "val sumDF = groupedDf.summarize(\n",
    "    \"mean_weight\" to { it[\"weight\"].mean(removeNA = true) },\n",
    "    \"num_persons\" to { nrow }\n",
    ")\n",
    "sumDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">first_name</th><th style=\"text-align:left\">last_name</th><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"Max\">Max</td><td style=\"text-align:left\" title=\"Doe\">Doe</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"55\">55</td></tr><tr><td style=\"text-align:left\" title=\"Franz\">Franz</td><td style=\"text-align:left\" title=\"Smith\">Smith</td><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"88\">88</td></tr><tr><td style=\"text-align:left\" title=\"Horst\">Horst</td><td style=\"text-align:left\" title=\"Keanes\">Keanes</td><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82\">82</td></tr></table></body></html>"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Optionally ungroup the data\n",
    "groupedDf.ungroup()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Person(age=23, mean_weight=71.5, num_persons=2), Person(age=12, mean_weight=82.0, num_persons=1)]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Generate object bindings for kotlin.\n",
    "data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)\n",
    "val records = sumDF.rowsAs<Person>()\n",
    "\n",
    "records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "71.5"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Now we can use the krangl result table in a strongly typed way\n",
    "records.first().mean_weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">age</th><th style=\"text-align:left\">mean_weight</th><th style=\"text-align:left\">num_persons</th></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"71.5\">71.5</td><td style=\"text-align:left\" title=\"2\">2</td></tr><tr><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82.0\">82.0</td><td style=\"text-align:left\" title=\"1\">1</td></tr></table></body></html>"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Vice versa we can also convert an existing set of objects into\n",
    "records.asDataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<html><body><table><tr><th style=\"text-align:left\">age</th><th style=\"text-align:left\">weight</th></tr><tr><td style=\"text-align:left\" title=\"23\">23</td><td style=\"text-align:left\" title=\"71.5\">71.5</td></tr><tr><td style=\"text-align:left\" title=\"12\">12</td><td style=\"text-align:left\" title=\"82.0\">82.0</td></tr></table></body></html>"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// to populate a data-frame with selected properties only, we can do\n",
    "records.deparseRecords { mapOf(\"age\" to it.age, \"weight\" to it.mean_weight) }"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Kotlin",
   "language": "kotlin",
   "name": "kotlin"
  },
  "language_info": {
   "codemirror_mode": "text/x-kotlin",
   "file_extension": "kt",
   "name": "kotlin"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
